---
title: "Ethnic Stacking in the Russian Armed Forces? Findings From A Leaked Dataset"
subtitle: "Appendix"
author: "Driscoll, Sichinava, Berglund"
format: docx
editor: visual
---

```{r, echo = F, message = F, include = F}
#| label: packages

required_packages <- c(
  "tidyverse",
  "ggforce",
  "extrafont",
  "readxl",
  "knitr",
  "MASS",
  "ordinal",
  "gt",
  "sf",
  "ggspatial",
  "classInt",
  "cowplot",
  "ggrepel",
  "ggpattern",
  "pscl",
  "marginaleffects",
  "texreg",
  "stargazer",
  "extrafont"
)

# Function to install and load packages

install_and_load <- function(package) {
  if (!require(package, character.only = TRUE)) {
    install.packages(package, dependencies = TRUE)
    library(package, character.only = TRUE)
  }
}

# Apply the function to all required packages
sapply(required_packages, install_and_load)

loadfonts(device = "win")

```

```{r, echo = F, message = F, include = F}
#| label: data
read_rds("ruaf_analysis_full_anon.rds") |> 
  mutate(
    category = factor(category),
    quintile_group = factor(quintile_group),
    others = 1-BelRusUkr_agr, 
    agr_ethnos_f = factor(agr_ethnos),
    agr_ethnos_f = relevel(agr_ethnos_f, ref = "BelRusUkr")
  ) |> 
  rename(
    "Prob_non_es_memo" = "ethn_non_e_slavs",
    "Prob_non_es_bess" = "others",
  )-> ruaf_analysis_full

ruaf_analysis_full |> 
  mutate(
    all_es = case_when(
      is.na(Prob_non_es_memo) ~ 0,
      T ~ as.double(Prob_non_es_memo)
    ),
    all_nese = case_when(
      is.na(Prob_non_es_memo) ~ 1,
      T ~ as.double(Prob_non_es_memo)
    )
  ) -> ruaf_imp

```

# Description of the Raw Data

```{r, echo = F, message = F}

rank_data <- tibble(
  `Ordinal Rank` = 0:15,
  `Ground Force Rank` = c(
    "Ryadovoy", "Yefreytor", "Mladshiy serzhant", "Serzhant", 
    "Starshiy serzhant", "Starshina", "Praporshchik", 
    "Starshiy praporshchik", "Mladshiy leytenant", "Leytenant", 
    "Starshey leytenant", "Kapitan", "Mayor", 
    "Podpolkovnik", "Polkovnik", "General-mayor"
  ),
  `Naval Force Rank` = c(
    "Matros", "Starshy matros", "Starshina 2 statji", 
    "Starshina 1 statji", "Glavny starshina", 
    "Glavny korabelny starshina", "Michman", 
    "Starshy michman", "Mladshiy leytenant", "Leytenant", 
    "Starshey leytenant", "Kapitan-leytenant", 
    "Kapitan 3-go ranga", "Kapitan 2-go ranga", 
    "Kapitan 1-go ranga", "Contre-admiral"
  ),
  `Generalized Rank` = c(
    "Enlisted", "Enlisted", "Enlisted", "Enlisted", 
    "Enlisted", "Enlisted", "Sergeant", "Sergeant", 
    "Junior officer", "Junior officer", "Junior officer", 
    "Junior officer", "Senior officer", "Senior officer", 
    "Senior officer", "General officer"
  )
)

rank_data |> 
  gt() |> 
  gt::cols_label(
    `Ordinal Rank` = "Ordinal Rank",
    `Ground Force Rank` = "Ground Force Rank",
    `Naval Force Rank` = "Naval Force Rank",
    `Generalized Rank` = "Generalized Rank"
  ) |>
  fmt_number(
    columns = `Ordinal Rank`,
    decimals = 0
  ) |> 
  tab_style(
    style = cell_text(weight = "bold"),
    locations = cells_column_labels()
  ) |> 
  tab_style(
    style = cell_text(style = "italic"),
    locations = cells_body(columns = c(`Ground Force Rank`, `Naval Force Rank`))
  ) -> tbl_ranks

tbl_ranks |> 
  tab_header(title = "Russian military ranks")

tbl_ranks |> 
  as_latex() |>
  writeLines(con = "tbl_ranks.tex")
```

"

"

```{r, echo = F, message = F}

ruaf_analysis_full |>
  filter(rank_known == 1) |> 
  group_by(
    category, rank_large_f
  ) |> 
  count() |> 
  ungroup() |> 
  mutate(
    category = case_when(
      is.na(category) ~ "Others",
      T ~ as.character(category)
    )
  ) |> 
  pivot_wider(
    names_from = "category",
    values_from = "n"
  )-> counts_for_table

column_names <- names(counts_for_table)[-1]

ruaf_analysis_full |>
  filter(rank_known == 1) |> 
  group_by(
    category, rank_large_f
  ) |> 
  count() |> 
  ungroup() |> 
  group_by(category) |> 
  mutate(
    proportion = paste0(round(n/sum(n), 3)*100, "%"),
    category = case_when(
      is.na(category) ~ "Others",
      T ~ as.character(category)
    )
  ) |> 
  ungroup() |> 
  dplyr::select(-n) |> 
    pivot_wider(
    names_from = "category",
    values_from = "proportion"
  ) -> props_for_table

counts_for_table |>
  left_join(props_for_table, by = "rank_large_f") |> 
  set_names(c("Generalized rank", rep(column_names, 2))) |> 
  janitor::clean_names() |> 
  dplyr::select(
    c("generalized_rank", "non_eastern_slav_majority", "non_eastern_slavs_constitute_significant_minority", "non_eastern_slavs_constitute_a_small_minority", "others", "non_eastern_slav_majority_2", "non_eastern_slavs_constitute_significant_minority_2", "non_eastern_slavs_constitute_a_small_minority_2", "others_2")
  ) |> 
  gt::gt() |> 
  gt::cols_label(
      non_eastern_slav_majority = "NESE 50%+",
      non_eastern_slavs_constitute_significant_minority = "NESE 20-50%",
      non_eastern_slavs_constitute_a_small_minority = "NESE <20%",
      others = "Others",
      non_eastern_slav_majority_2 = "NESE 50%+",
      non_eastern_slavs_constitute_significant_minority_2 = "NESE 20-50%",
      non_eastern_slavs_constitute_a_small_minority_2 = "NESE <20%",
      others_2 = "Others"
  ) |> 
  gt::tab_spanner(label = "Total count", columns = c(
    non_eastern_slav_majority, non_eastern_slavs_constitute_significant_minority, non_eastern_slavs_constitute_a_small_minority, others
  )) |> 
  gt::tab_spanner(label = "% within the rank", columns = c(
    non_eastern_slav_majority_2, non_eastern_slavs_constitute_significant_minority_2, 
    non_eastern_slavs_constitute_a_small_minority_2, others_2
  )) -> tbl_ethn_by_regcomp

tbl_ethn_by_regcomp |> 
  tab_header(
    title = "Distribution by the type of the administrative region of the Russian Federation. Typologies are derived from the ethnic/national composition of regions as per the 2020-2021 National Census."
  )

tbl_ethn_by_regcomp |> 
  as_latex() |>
  writeLines(con = "tbl-ethn-by-regcomp.tex")
```

"

"

```{r, echo = F, message = F}
ruaf_analysis_full |>
  filter(rank_known == 1) |> 
  group_by(
    quintile_group, rank_large_f
  ) |> 
  count() |> 
  ungroup() |> 
  mutate(
    quintile_group = case_when(
      is.na(quintile_group) ~ "Others",
      T ~ as.character(quintile_group)
    )
  ) |> 
  pivot_wider(
    names_from = "quintile_group",
    values_from = "n"
  )-> counts_for_table

column_names <- names(counts_for_table)[-1]

ruaf_analysis_full |>
  filter(rank_known == 1) |> 
  group_by(
    quintile_group, rank_large_f
  ) |> 
  count() |> 
  ungroup() |> 
  group_by(quintile_group) |> 
  mutate(
    proportion = paste0(round(n/sum(n), 3)*100, "%"),
    quintile_group = case_when(
      is.na(quintile_group) ~ "Others",
      T ~ as.character(quintile_group)
    )
  ) |> 
  ungroup() |> 
  dplyr::select(-n) |> 
    pivot_wider(
    names_from = "quintile_group",
    values_from = "proportion"
  ) -> props_for_table

counts_for_table |>
  left_join(props_for_table, by = "rank_large_f") |>
  set_names(c("Generalized rank", rep(column_names, 2))) |> 
  janitor::clean_names() |>
  gt::gt() |> 
  gt::cols_label(
      generalized_rank = "Generalized rank",
      x1 = "1 (lowest)",
      x2 = "2",
      x3 = "3",
      x4 = "4",
      x5 = "5",
      others = "Others",
      x1_2 = "1 (lowest)",
      x2_2 = "2",
      x3_2 = "3",
      x4_2 = "4",
      x5_2 = "5",
      others_2 = "Others",
  ) |> 
  gt::tab_spanner(label = "Total count", columns = c(x1:others),
  ) |> 
  gt::tab_spanner(label = "% within the rank", columns = c(
    x1_2:others_2,
  )) -> tbl_rank_by_econreg

tbl_rank_by_econreg |> 
  tab_header(
    title = "Distribution by the type of administrative division of the Russian Federation as per economic class. Typologies are derived from the gross regional product estimates by Rosstat as of 2021"
  )

tbl_rank_by_econreg |> 
  as_latex() |>
  writeLines(con = "tbl-rank-by-econreg.tex")
```

"

"

```{r, echo = F, message =F}
ruaf_analysis_full |>
  filter(!is.na(geography)) |> 
  group_by(
    geography, rank_known
  ) |> 
  count() |> 
  pivot_wider(
    names_from = "rank_known",
    values_from = "n"
  ) |> 
  arrange(desc(`1`)) |> 
  mutate(
    geography = stringi::stri_trans_general(geography, "russian-latin/bgn")
  ) |>
  ungroup() |> 
  set_names(
    "Geography", "Records with no information on military ranks", "Records with information on military ranks"
  ) |> 
  gt::gt() -> tbl_rank_by_reg

tbl_rank_by_reg |> 
  tab_header(
    title = "Distribution of ranks by administrative regions"
  )

tbl_rank_by_reg |> 
  as_latex() |>
  writeLines(con = "tbl-rank-by-reg.tex")

```

# Data Transformation and Coding

```{r fig-violin-charts-mem, echo = F, message = F, warning=F, fig.dpi=600, fig.cap="Distribution of inferred probabilities from Memorial classifier of Eastern Slav / national minority status among service personnel. Only those records that have identifiable rank are presented."}

ruaf_analysis_full |>
  filter(rank_known == 1) |> 
  ggplot(aes(rank_large_f, Prob_non_es_memo, fill = rank_large_f, color = rank_large_f))+
  geom_violin()+
  scale_color_manual(values = rep(c("#666666"), 5))+
  scale_fill_manual(values = c("#666666", "#919191", "#b3b3b3", "#d9d9d9", "#efefef"))+
  guides(color=guide_legend(title="Military ranks"))+
  labs(
    x = "Ranks",
    y = "Probability of a last name being NESE"
  ) +
  theme_bw()+
  theme(
    legend.position = "none",
    axis.text = element_text(family = "Arial Narrow", size = 9),
    axis.title = element_text(family = "Arial Narrow", size = 12),
  ) -> fig_violin_charts_mem


fig_violin_charts_mem

ggsave(plot = fig_violin_charts_mem, filename = "fig_violin_charts_mem.pdf", dpi = 600, width = 7015, height = 4960, units = "px")
```

"

"

```{r fig-violin-charts-b, echo = F, message = F, warning = F, fig.dpi=600, fig.cap = "Distribution of inferred probabilities from the Bessudnov et. al. classifier of Eastern Slav / national minority status among service personnel. Only those records that have identifiable rank are presented."}

ruaf_analysis_full |>
  filter(rank_known == 1, !is.na(Prob_non_es_memo)) |>
  mutate(
    bessudnov_oth = 1-BelRusUkr_agr
  ) |> 
  ggplot(aes(rank_large_f, bessudnov_oth, fill = rank_large_f, color = rank_large_f))+
  geom_violin()+
  scale_color_manual(values = rep(c("#666666"), 5))+
  scale_fill_manual(values = c("#666666", "#919191", "#b3b3b3", "#d9d9d9", "#efefef"))+
  guides(color=guide_legend(title="Military ranks"))+
  labs(
    x = "Ranks",
    y = "Probability of a last name being NESE\n(Bessudnov et. al. algorithm)"
  ) +
  theme_bw()+
  theme(
    legend.position = "none",
    axis.text = element_text(family = "Arial Narrow", size = 9),
    axis.title = element_text(family = "Arial Narrow", size = 12),
  )-> fig_violin_charts_b

fig_violin_charts_b

ggsave(plot = fig_violin_charts_b, filename = "fig_violin_charts_b.pdf", dpi = 600, width = 7015, height = 4960, units = "px")

```

"

"

```{r, echo = F, message = F}

ruaf_analysis_full |>
  filter(rank_known == 1, prob_95pct != "Not classified") |> 
  group_by(
    prob_95pct, rank_large_f
  ) |> 
  count() |> 
  ungroup() |> 
  group_by(prob_95pct) |> 
  mutate(
    proportion = paste0(round(n/sum(n), 3)*100, "%")
  ) |> 
  ungroup() |>
  dplyr::select(-n) |> 
    pivot_wider(
    names_from = "prob_95pct",
    values_from = "proportion"
  ) |> 
  mutate(
    dataset = "Memorial"
  ) |> 
  set_names(
    c(
      "rank", "NESE", "Slavic", "dataset"
    )
  )-> props_for_table_memorial


ruaf_analysis_full |>
    mutate(
    BelRusUkr_cat = case_when(
      agr_ethnos == "BelRusUkr" ~ "ES soldiers",
      agr_ethnos == "INVALID" ~ "Not classified",
      T ~ "NESE soldiers"
    )
  ) |> 
  filter(rank_known == 1, BelRusUkr_cat != "Not classified") |> 
  group_by(
    BelRusUkr_cat, rank_large_f
  ) |> 
  count() |> 
  ungroup() |> 
  group_by(BelRusUkr_cat) |> 
  mutate(
    proportion = paste0(round(n/sum(n), 3)*100, "%")
  ) |> 
  ungroup() |>
  dplyr::select(-n) |> 
    pivot_wider(
    names_from = "BelRusUkr_cat",
    values_from = "proportion"
  ) |> 
  mutate(
    dataset = "Bessudnov et al., Memorial subset"
  ) |> 
  set_names(
    "rank", "Slavic", "NESE", "dataset"
  )-> props_for_table_bessudnov_restricted


ruaf_analysis_full |>
    mutate(
    BelRusUkr_cat = case_when(
      agr_ethnos == "BelRusUkr" ~ "ES soldiers",
      agr_ethnos == "INVALID" ~ "Not classified",
      T ~ "NESE soldiers"
    )
  ) |> 
  filter(rank_known == 1, BelRusUkr_cat != "Not classified") |> 
  group_by(
    BelRusUkr_cat, rank_large_f
  ) |> 
  count() |> 
  ungroup() |> 
  group_by(BelRusUkr_cat) |> 
  mutate(
    proportion = paste0(round(n/sum(n), 3)*100, "%")
  ) |> 
  ungroup() |>
  dplyr::select(-n) |> 
    pivot_wider(
    names_from = "BelRusUkr_cat",
    values_from = "proportion"
  ) |> 
  mutate(
    dataset = "Bessudnov et al., full dataset"
  ) |> 
  set_names(
    "rank", "Slavic", "NESE", "dataset"
  )-> props_for_table_bessudnov_full

props_for_table_memorial |> 
  bind_rows(props_for_table_bessudnov_restricted, props_for_table_bessudnov_full) |> 
  pivot_wider(
    names_from = "rank",
    values_from = c("Slavic", "NESE"), names_sep = " ", names_glue = "{.value} {rank} (%)"
  ) |>
  dplyr::select(
    "dataset", "Slavic Enlisted (%)", "NESE Enlisted (%)", "Slavic Sergeants (%)", "NESE Sergeants (%)", "Slavic Junior officers (%)", "NESE Junior officers (%)", "Slavic Senior officers (%)", "NESE Senior officers (%)", "Slavic General officers (%)", "NESE General officers (%)"
  ) |> 
  gt() -> fig_natl_rank_comparison


fig_natl_rank_comparison |> 
  tab_header(
    "Distribution of ranks by inferred nationality (Memorial and Bessudnov et. al. classifiers). Memorial classifier uses 95% probability as a threshold to delineate an inferred nationality status of an Eastern Slav / national minority personnel."
  )

fig_natl_rank_comparison |> 
  as_latex() |>
  writeLines(con = "fig-natl-rank-comparison.tex")

```

# Full Model Output

```{r, echo = F, message = F, cache = T, eval = T}
## Currently, this chunk is commented out since it takes a considerable amount of time to run full models. Instead, model results are saved as separate .rds files and then loaded back to make output tables. Please change "eval = F" to "eval = T" to replicate the analysis.

## Model 1 FE: Simple demographic model with Memorial nationalities, fixed effects on geography

mod_1_fe <- ordinal::clmm(rank_large_f~branch+sex + Prob_non_es_memo + (1|geography), data = ruaf_analysis_full)

## Model 1: Simple demographic model with Memorial nationalities

model_1 <- MASS::polr(rank_large_f~branch+sex+Prob_non_es_memo, data = ruaf_analysis_full)

## Model 2: Simple demographic model with Bessudnov et al nationalities

model_2  <- MASS::polr(rank_large_f~branch+sex+birth_cohorts+Prob_non_es_bess, data = ruaf_analysis_full)

## Model 2: Simple demographic model with Bessudnov et al nationalities, geography FEs

model_2_fe  <- ordinal::clmm(rank_large_f~branch+sex+birth_cohorts+Prob_non_es_bess + (1|geography), data = ruaf_analysis_full)

## Model 3: Demographic + birth cohorts + Income/large city + Ethnic/demographic + nat/republic controls, with Memorial nationalities

model_3 <- MASS::polr(rank_large_f~branch+sex+birth_cohorts+Prob_non_es_memo+category+quintile_group + millionaires + national_republics, data = ruaf_analysis_full)

## Model 4: Demographic + Income/large city + Ethnic/demographic + nat/republic controls, with Memorial nationalities X birth cohorts interaction

model_4 <- MASS::polr(rank_large_f~branch+sex+birth_cohorts*Prob_non_es_memo+category+quintile_group + millionaires + national_republics, data = ruaf_analysis_full)

## Impute data for model robustness check where nationality is missing in the Memorial data

ruaf_analysis_full |> 
  mutate(
    all_es = case_when(
      is.na(Prob_non_es_memo) ~ 0,
      T ~ as.double(Prob_non_es_memo)
    ),
    all_nese = case_when(
      is.na(Prob_non_es_memo) ~ 1,
      T ~ as.double(Prob_non_es_memo)
    )
  ) -> ruaf_imp

## Model 5: Same as Model 4, but imputes all unclassified records as NESE

model_5 <- MASS::polr(rank_large_f~branch+sex+birth_cohorts*all_nese+category+quintile_group + millionaires + national_republics, data = ruaf_imp)

## Model 6: Same as Model 4, but imputes all unclassified records as ES

model_6 <- MASS::polr(rank_large_f~branch+sex+birth_cohorts*all_es+category+quintile_group + millionaires + national_republics, data = ruaf_imp)

## Model 7: Full model with all controls, with categorical nationalities based on the Bessudnov et al approach.

model_7 <- MASS::polr(rank_large_f~branch+agr_ethnos_f+sex+birth_cohorts+category+quintile_group + millionaires + national_republics, data = ruaf_analysis_full)



## Assemble table data
row_names <- c(
  "NESE (Memorial) ", "NESE (B et al) ", "Gender Controls ", "Branch Controls ", "Age Cohort Controls ", "Place of Registration: Income/ Large City Controls", "Place of Registration: Ethnic Demography Controls", "Place of Registration: National Republic Controls", "NESE*[b.1975-1980] ", "NESE*[b.1981-1984] ", "NESE*[b.1985-1990] ", "NESE*[b. 1991-1994] ", "NESE*[b. after 1995] ", "Recoding Rule: All unclassified = NESE", "Recoding Rule: All unclassified = ES",  "Tatar/Bashkir", "Jews", "Chechen/Ingush/Daghestani", "Tajik/Uzbek", "Buryat", "Yakut", "McFadden's Pseudo R^2", "N"
)



## Additional robustness checks for officers vs. others and junior vs. senior officers.
ruaf_analysis_full |> 
  mutate(
    officers_vs_others = case_when(
      rank_num %in% c(8:15) ~ 1,
      rank_num %in% c(0:7) ~ 0,
      T ~ NA_real_
    ),
    junior_vs_senior = case_when(
      rank_num %in% (12:15) ~ 1,
      rank_num %in% c(8:11) ~ 0,
      T ~ NA_real_
    )
  ) -> ruaf_analysis_full_rc


model_3_officers_others <- glm(officers_vs_others~branch+sex+birth_cohorts+Prob_non_es_memo+category+quintile_group + millionaires + national_republics, data = ruaf_analysis_full_rc, family = "gaussian")

model_7_officers_others <- glm(officers_vs_others~branch+agr_ethnos_f+sex+birth_cohorts+category+quintile_group + millionaires + national_republics, data = ruaf_analysis_full_rc, family = "gaussian")

model_3_officers <- glm(junior_vs_senior~branch+sex+birth_cohorts+Prob_non_es_memo+category+quintile_group + millionaires + national_republics, data = ruaf_analysis_full_rc, family = "gaussian")

model_7_officers <- glm(junior_vs_senior~branch+agr_ethnos_f+sex+birth_cohorts+category+quintile_group + millionaires + national_republics, data = ruaf_analysis_full_rc, family = "gaussian")




## Calculate McFadden's Pseudo R^2

model_1_psr2 <- pR2(model_1)
model_2_psr2 <- pR2(model_2)
model_3_psr2 <- pR2(model_3)
model_4_psr2 <- pR2(model_4)
model_5_psr2 <- pR2(model_5)
model_6_psr2 <- pR2(model_6)
model_7_psr2 <- pR2(model_7)


## McFadden's Pseudo R^2 for fixed effects models

# Null model

null_mod <- clmm(rank_large_f ~ 1 + (1 | geography), data = ruaf_analysis_full)
logLik_null <- logLik(null_mod)

# Model 1
logLik_full_mod1_fe <- logLik(mod_1_fe)

# Model 2
logLik_full_mod2_fe <- logLik(model_2_fe)

model_1_fe_r2 <- 1 - as.numeric(logLik_full_mod1_fe / logLik_null)
model_2_fe_r2 <- 1 - as.numeric(logLik_full_mod2_fe / logLik_null)

write_rds(model_1, "model_1.rds")
write_rds(mod_1_fe, "model_1_fe.rds")
write_rds(model_2, "model_2.rds")
write_rds(model_2_fe, "model_2_fe.rds")
write_rds(model_3, "model_3.rds")
write_rds(model_3_officers_others, "model_3_officers_others.rds")
write_rds(model_3_officers, "model_3_officers.rds")
write_rds(model_4, "model_4.rds")
write_rds(model_5, "model_5.rds")
write_rds(model_6, "model_6.rds")
write_rds(model_7, "model_7.rds")
write_rds(model_7_officers_others, "model_7_officers_others.rds")
write_rds(model_7_officers, "model_7_officers.rds")


```

"

"

```{r, echo = F, message = F}

model_1<- read_rds("model_1.rds")
model_1_fe<- read_rds("model_1_fe.rds")
model_2<- read_rds("model_2.rds")
model_2_fe<- read_rds("model_2_fe.rds")
model_3<- read_rds("model_3.rds")
model_3_officers_others<- read_rds("model_3_officers_others.rds")
model_3_officers<- read_rds("model_3_officers.rds")
model_4<- read_rds("model_4.rds")
model_5<- read_rds("model_5.rds")
model_6<- read_rds("model_6.rds")
model_7<- read_rds("model_7.rds")
model_7_officers_others<- read_rds("model_7_officers_others.rds")
model_7_officers<- read_rds("model_7_officers.rds")

htmlreg(
  l = list(
    model_1_fe, model_2_fe, model_3, model_4, model_5, model_6, model_7
  ),
  custom.model.names = c(
    "Model 1: Baseline with Memorial nationalities (region fixed effects)",
    "Model 2: Baseline with Bessudnov et. al. nationalities (region fixed effects)",
    "Model 3: Demographic + birth cohorts + Income/large city + Ethnic/demographic + nat/republic controls, with Memorial nationalities",
    "Model 4: Demographic + Income/large city + Ethnic/demographic + natl.republic controls, with Memorial nationalities X birth cohorts interaction",
    "Model 5: Same as Model 4, but imputes all unclassified records as NESE",
    "Model 6: Same as Model 4, but imputes all unclassified records as ES",
    "Model 7: Full model with all controls, with categorical nationalities based on the Bessudnov et al approach."
  ),
  digits = 3,
  custom.note = "Standard errors in parentheses. * p < 0.05, ** p < 0.01, *** p < 0.001",
  file = "all_models.html"
) -> html_tab



```

::: {#tbl-main-reg}
```{=html}

{{< include "all_models.html" >}}
```

Main regression models referenced in the manuscript.
:::

```{r, echo = F, message = F}
htmlreg(
  l = list(
    model_3_officers_others, model_3_officers, model_7_officers_others, model_7_officers
  ),
  custom.model.names = c(
    "Model 3A: Officers vs. others, baseline with Memorial nationalities (region fixed effects)",
    "Model 3B: Junior vs. Senior officers, with Memorial nationalities (region fixed effects)",
    "Model 7A: Officers vs. others, full model with all controls, with categorical nationalities based on the Bessudnov et al approach.",
    "Model 7B: Junior vs. Senior officers, full model with all controls, with categorical nationalities based on the Bessudnov et al approach."
  ),
  digits = 3,
  custom.note = "Standard errors in parentheses. * p < 0.05, ** p < 0.01, *** p < 0.001",
  file = "rob_check.html"
)



```

::: {#tbl-rob-reg}
```{=html}

{{< include "rob_check.html" >}}
```

Robustness checks: Binary outcome variables with Gaussian link
:::

"

"

```{r fig-ethnos-pp-mod3-75-80, echo = F, message = F,  fig.dpi=600, fig.cap='Predicted probability of having a rank by inferred nationality by the Memorial classifier and administrative region type. Branch is kept at "Army," birth cohort is fixed at 1975-1980.'}


ggeffects::ggpredict(model_3, terms = c("Prob_non_es_memo[0.05, 0.95]", "branch[Army]", "birth_cohorts[1975-1980]",  "national_republics[0, 1]")) -> mod3_pp_birth_1975_1980

mod3_pp_birth_1975_1980 |>
  ungroup() |> 
  data.frame() |>
  mutate(
    ranks = case_when(
      response.level == "Enlisted" ~ 1,
      response.level == "Sergeants" ~ 2,
      response.level == "Junior officers" ~ 3,
      response.level == "Senior officers" ~ 4,
      response.level == "General officers" ~ 5,
    ),
    ranks = factor(ranks, levels = c(1:5), labels = c("Enlisted", "Sergeants", "Junior officers", "Senior officers", "General officers")),
    ethnicity = factor(x, levels = c(0.05, 0.95), labels = c("Eastern Slavic", "Minority")),
    nat_rep = factor(panel, levels = c(1, 0), labels = c("National republics", "Other administrative units"))
  ) |> 
  ggplot(aes(ethnicity, predicted, shape = nat_rep))+
  geom_point(position = position_dodge(width = 0.5), size = 2)+
  geom_linerange(aes(ymin = conf.low, ymax = conf.high), position = position_dodge(width = 0.5))+
  guides(color=guide_legend(title="Classification of a region"))+
  facet_wrap(~ranks)+
  labs(
    shape = NULL,
    x = "Inferred nationality (95% algorithmic confidence)",
    y = "Probability of having a rank"
  ) +
  theme_bw()+
  theme(
    legend.position = "bottom",
    axis.text = element_text(family = "Arial Narrow", size = 9),
    axis.title = element_text(family = "Arial Narrow", size = 12),
    strip.text = element_text(family = "Arial Narrow", size = 14),
    legend.text = element_text(family = "Arial Narrow", size = 9)
  )-> fig_pp_ethnos_1

fig_pp_ethnos_1

ggsave(plot = fig_pp_ethnos_1, filename = "fig-ethnos-pp-mod3-75-80.pdf", dpi = 600, width = 7015, height = 4960, units = "px")


```

"

"

```{r fig-ethnos-pp-mod3-81-84, echo = F, message = F,  fig.dpi=600, fig.cap='Predicted probability of having a rank by inferred nationality by the Memorial classifier and administrative region type. Branch is kept at "Army," birth cohort is fixed at 1981-1984.'}

ggeffects::ggpredict(model_3, terms = c("Prob_non_es_memo[0.05, 0.95]", "branch[Army]", "birth_cohorts[1981-1984]",  "national_republics[0, 1]")) -> mod3_pp_birth_1981_1984

mod3_pp_birth_1981_1984 |>
  ungroup() |> 
  data.frame() |>
  mutate(
    ranks = case_when(
      response.level == "Enlisted" ~ 1,
      response.level == "Sergeants" ~ 2,
      response.level == "Junior officers" ~ 3,
      response.level == "Senior officers" ~ 4,
      response.level == "General officers" ~ 5,
    ),
    ranks = factor(ranks, levels = c(1:5), labels = c("Enlisted", "Sergeants", "Junior officers", "Senior officers", "General officers")),
    ethnicity = factor(x, levels = c(0.05, 0.95), labels = c("Eastern Slavic", "Minority")),
    nat_rep = factor(panel, levels = c(1, 0), labels = c("National republics", "Other administrative units"))
  ) |> 
  ggplot(aes(ethnicity, predicted, shape = nat_rep))+
  geom_point(position = position_dodge(width = 0.5), size = 2)+
  geom_linerange(aes(ymin = conf.low, ymax = conf.high), position = position_dodge(width = 0.5))+
  guides(color=guide_legend(title="Classification of a region"))+
  facet_wrap(~ranks)+
  labs(
    shape = NULL,
    x = "Inferred nationality (95% algorithmic confidence)",
    y = "Probability of having a rank"
  ) +
  theme_bw()+
  theme(
    legend.position = "bottom",
    axis.text = element_text(family = "Arial Narrow", size = 9),
    axis.title = element_text(family = "Arial Narrow", size = 12),
    strip.text = element_text(family = "Arial Narrow", size = 14),
    legend.text = element_text(family = "Arial Narrow", size = 9)
  )-> fig_pp_ethnos_2

fig_pp_ethnos_2

ggsave(plot = fig_pp_ethnos_2, filename = "fig-ethnos-pp-mod3-81-84.pdf", dpi = 600, width = 7015, height = 4960, units = "px")


```

"

"

```{r fig-ethnos-pp-mod3-85-90, echo = F, message = F, fig.dpi = 600, fig.cap = 'Predicted probability of having a rank by inferred nationality by the Memorial classifier and administrative region type. Branch is kept at "Army," birth cohort is fixed at 1985-1990.'}

ggeffects::ggpredict(model_3, terms = c("Prob_non_es_memo[0.05, 0.95]", "branch[Army]", "birth_cohorts[1985-1990]",  "national_republics[0, 1]")) -> mod3_pp_birth_1985_1990

mod3_pp_birth_1985_1990 |>
  ungroup() |> 
  data.frame() |>
  mutate(
    ranks = case_when(
      response.level == "Enlisted" ~ 1,
      response.level == "Sergeants" ~ 2,
      response.level == "Junior officers" ~ 3,
      response.level == "Senior officers" ~ 4,
      response.level == "General officers" ~ 5,
    ),
    ranks = factor(ranks, levels = c(1:5), labels = c("Enlisted", "Sergeants", "Junior officers", "Senior officers", "General officers")),
    ethnicity = factor(x, levels = c(0.05, 0.95), labels = c("Eastern Slavic", "Minority")),
    nat_rep = factor(panel, levels = c(1, 0), labels = c("National republics", "Other administrative units"))
  ) |> 
  ggplot(aes(ethnicity, predicted, shape = nat_rep))+
  geom_point(position = position_dodge(width = 0.5), size = 2)+
  geom_linerange(aes(ymin = conf.low, ymax = conf.high), position = position_dodge(width = 0.5))+
  guides(color=guide_legend(title="Classification of a region"))+
  facet_wrap(~ranks)+
  labs(
    shape = NULL,
    x = "Inferred nationality (95% algorithmic confidence)",
    y = "Probability of having a rank"
  ) +
  theme_bw()+
  theme(
    legend.position = "bottom",
    axis.text = element_text(family = "Arial Narrow", size = 9),
    axis.title = element_text(family = "Arial Narrow", size = 12),
    strip.text = element_text(family = "Arial Narrow", size = 14),
    legend.text = element_text(family = "Arial Narrow", size = 9)
  )-> fig_pp_ethnos_3

fig_pp_ethnos_3

ggsave(plot = fig_pp_ethnos_3, filename = "fig-ethnos-pp-mod3-85-90.pdf", dpi = 600, width = 7015, height = 4960, units = "px")


```

"

"

```{r fig-ethnos-pp-mod7, echo = F, message =F, fig.dpi = 600, fig.cap = 'Average marginal effects of a nationality on having a particular rank. Full model with all controls, with categorical nationalities based on the Bessudnov et al approach. All other controls are kept at their means.'}

avg_slopes(model_7, "agr_ethnos_f", newdata = "mean") -> slopes_mean_model_7

slopes_mean_model_7 |>
  mutate(
    contrast = str_replace_all(contrast, "mean\\(", ""),
    contrast = str_replace_all(contrast, "\\)", ""),
    contrast = str_replace_all(contrast, " - BelRusUkr", ""),
    contrast = case_when(
      contrast == "TajUzb" ~ "Tajik or Uzbek",
      contrast == "KazKyr" ~ "Kazakh or Kyrgyz",
      contrast == "KabAdKarBalOs" ~ "Kabardan, Adyghe, Karachai, Balkar or Ossetian",
      contrast == "CheDagIng" ~ "Chechen, Ingush or Daghestani",
      contrast == "BashTat" ~ "Bashkir or Tatar",
      T ~ as.character(contrast)
    )
  ) |> 
    filter(
    group != "General officers", contrast != "INVALID"
  ) |>
  ggplot(
    aes(x = estimate, y = contrast)
  ) +
  geom_point() +
  geom_linerange(
    aes( y = contrast, xmin = conf.low, xmax = conf.high)
  ) + 
  facet_wrap(~group) + 
  geom_vline(xintercept = 0)+
  labs(
    x = "Estimated average marginal effect.\nLine ranges denote 95% CIs"
  ) +
  theme_bw() +
  theme(
    legend.position = "none",
    plot.caption = element_text(family = "Arial Narrow", size = 7),
    axis.text = element_text(family = "Arial Narrow", size = 9),
    axis.title = element_text(family = "Arial Narrow", size = 12),
    axis.title.y = element_blank(),
    strip.text = element_text(family = "Arial Narrow", size = 14),
    strip.placement = "outside",
    strip.background = element_blank()
  )-> fig_pp_ethnos_m7

fig_pp_ethnos_m7

ggsave(plot = fig_pp_ethnos_m7, filename = "fig-ethnos-pp-mod7.pdf", dpi = 600, width = 7015, height = 4960, units = "px")


```

# Bibliography

```{r, echo = F, message = F, results = 'asis'}
#| echo: false
#| 
map(required_packages, function(pkg) {
  out <- capture.output(print(citation(pkg), style = "text"))
  cat(out, sep = "\n")
  cat("\n\n")
})
```
